#!/usr/bin/env python

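# This script converts pinyin syllable transcripts into phone sequences
# using the DaCiDian layer-2 mapping.
# FROM:
# 	layer-2 mapping: pinyin_to_phone.txt (DaCiDian/pinyin_to_phone.txt)
# 	syllable transcripts: train.syllable.txt
# 	(the layer-1 mapping DaCiDian/word_to_pinyin.txt is not read here)
# TO:
# 	phone_seqs.txt, phones.txt, phone_datas.txt (no lexicon.txt is written)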

import sys


def split_base_tone(str_syllable):
    """Split a syllable string into (base, tone).

    The final character is returned as the tone and everything before it
    as the base. An empty input yields two empty strings; a one-character
    input yields an empty base and that character as the tone.
    """
    return str_syllable[:-1], str_syllable[-1:]


def gen_py2phone_map(pinyin_to_phone_file):
    """Load a syllable -> phone-list table from a tab-separated text file.

    Each non-blank line must have exactly two tab-separated columns: the
    syllable and a whitespace-separated list of phones, e.g. "ZHENG\tzh eng".

    Args:
        pinyin_to_phone_file: Path of the UTF-8 encoded mapping file.

    Returns:
        A dict mapping each syllable string to its list of phone strings.
        If a syllable appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not have exactly two columns.
    """
    map_py2phone = {}
    # The context manager closes the file even if a malformed line aborts
    # the load part-way through.
    with open(pinyin_to_phone_file, encoding='utf-8') as table:
        for line_no, line in enumerate(table, 1):
            entry = line.strip()
            if not entry:
                # A blank (e.g. trailing) line carries no mapping.
                continue
            cols = entry.split('\t')  # "ZHENG	zh eng"
            if len(cols) != 2:
                # Raised explicitly rather than asserted so that the check
                # still runs when Python is started with -O.
                raise ValueError(
                    "%s:%d: expected 2 tab-separated columns, got %d"
                    % (pinyin_to_phone_file, line_no, len(cols)))
            syllable, phones = cols
            map_py2phone[syllable] = phones.split()
    return map_py2phone


def pinyin2phone(syllable, map_py2phone):
    """Return the phones of a toned syllable, with the tone on the last phone.

    The syllable is split by split_base_tone(); the base is looked up in
    map_py2phone and '_' plus the tone is appended to the final phone of a
    copy of the looked-up list, so the mapping itself is left untouched.
    A base missing from map_py2phone raises KeyError.
    """
    base, tone = split_base_tone(syllable)
    toned_phones = list(map_py2phone[base])
    toned_phones[-1] += '_' + tone
    return toned_phones


# Module-level table used by main(). It is built as soon as this module is
# executed or imported, so "pinyin_to_phone.txt" must exist relative to the
# current working directory at that point.
pinyin_to_phone_file = "pinyin_to_phone.txt"  # layer-2 mapping
map_py2phone = gen_py2phone_map(pinyin_to_phone_file)


def main():
    """Convert syllable transcripts to phone sequences and build a phone table.

    Reads "train.syllable.txt", where each line is a word followed by
    space-separated toned syllables. For every line it writes the word and
    its phone sequence to "phone_seqs.txt", with a ";" token after each
    syllable's phones. It then prints the collected phones and the distinct
    phone set, and writes "phones.txt" with one "phone:index" line per
    distinct phone, indices starting at 1.

    Raises:
        KeyError: If a syllable's base is missing from map_py2phone
            (propagated from pinyin2phone).
    """
    phones_list = []
    pinyin_file = "train.syllable.txt"
    # Both files are managed by `with` so they are closed even when a
    # syllable lookup fails in the middle of the input.
    with open('phone_seqs.txt', 'w', encoding="utf-8") as f_phone_seq, \
            open(pinyin_file, encoding='utf-8') as f_pinyin:
        for line in f_pinyin:
            # First column is the word; the rest are its syllables.
            word, *syllables = line.strip().split(' ')
            phone_seq = []
            for syllable in syllables:
                phone_seq.extend(pinyin2phone(syllable, map_py2phone))
                phone_seq.append(";")  # syllable boundary marker
            phones_list.extend(phone_seq)
            f_phone_seq.write(word + ' ' + ' '.join(phone_seq) + '\n')
    print(len(phones_list), phones_list)
    phones_set = set(phones_list)
    print(len(phones_set), phones_set)
    # Opened in text mode ('wb' would mean writing a binary file).
    with open('phones.txt', 'w', encoding="utf-8") as f_phones:
        # Sort before numbering: set iteration order for strings can differ
        # between interpreter runs, which would make the IDs irreproducible.
        for index, phone in enumerate(sorted(phones_set), 1):
            f_phones.write("%s:%d\n" % (phone, index))


def get_phone_dict(phones="phones.txt"):
    """Load the phone -> index table from a "phone:index" text file.

    Args:
        phones: Path of the UTF-8 encoded table; defaults to "phones.txt"
            in the current working directory.

    Returns:
        A dict mapping each phone string to its index, kept as a string
        exactly as it appears in the file.

    Raises:
        ValueError: If a non-blank line contains no ':' separator.
    """
    dict_all = {}  # start from an empty dict
    with open(phones, encoding='utf-8') as f_phones:
        for line in f_phones:
            entry = line.strip()
            if not entry:
                continue
            # Split on the LAST ':' so a phone symbol that itself contains
            # ':' still maps to the index at the end of the line.
            phone, index = entry.rsplit(':', 1)
            dict_all[phone] = index
    return dict_all


def gen_phones_data():
    """Rewrite "phone_seqs.txt" with every phone replaced by its index.

    Loads the phone table via get_phone_dict(), reads "phone_seqs.txt"
    (a word followed by space-separated phones on each line) and writes
    "phone_datas.txt" with the same word followed by the space-separated
    integer indices of those phones.

    Raises:
        KeyError: If a phone in "phone_seqs.txt" is missing from the table.
        ValueError: If a table entry's index is not an integer literal.
    """
    phone_dict = get_phone_dict()
    phone_seqs = "phone_seqs.txt"
    # `with` guarantees both files are closed even if a lookup fails.
    with open('phone_datas.txt', 'w', encoding="utf-8") as f_phone_datas, \
            open(phone_seqs, encoding='utf-8') as f_phone_seqs:
        for line in f_phone_seqs:
            # First column is the word; the rest are its phones.
            word, *phones = line.strip().split(' ')
            # int() validates and normalizes the stored index text.
            phone_data = [str(int(phone_dict[phone])) for phone in phones]
            f_phone_datas.write(word + ' ' + ' '.join(phone_data) + '\n')


if __name__ == "__main__":
    # Order matters: main() writes phone_seqs.txt and phones.txt, which
    # gen_phones_data() then reads to produce phone_datas.txt.
    main()
    gen_phones_data()
